## Documentation for Data Merging ----------------------------------------------
# This R script reports how I merged petitions (author's FOIA), lobbying 
# (LobbyView), and financial (Orbis) data, creating data-merge-91-17.RData
# See Appendix B of the manuscript for further details and discussions.


## IMPORTANT NOTE --------------------------------------------------------------
# Orbis data are proprietary and prohibited from sharing publicly. Hence, this 
# R script, which relies on Orbis data in the merge, serves only for JOP 
# reporting purposes and can not be run.


## setup -----------------------------------------------------------------------
# clean slate
rm(list = ls())
date()

# load packages
pkg <- c("tidyverse")

lapply(pkg, require, character.only = TRUE)

# set main directory
MAIN_DIR <- "~/Dropbox/Research/dissertation/mnc"


## load cleaned data -----------------------------------------------------------
load(file = paste(MAIN_DIR, "lv-imm-fy-99-17.RData", sep = "/")) # immigration lobbying data
load(file = paste(MAIN_DIR, "lv-tob-fy-17.RData", sep = "/")) # tobacco lobbying data
load(file = paste(MAIN_DIR, "lv-bev-fy-17.RData", sep = "/")) # beverage lobbying data
load(file = paste(MAIN_DIR, "lv-cdt-fy-17.RData", sep = "/")) # commodities lobbying data
load(file = paste(MAIN_DIR, "sup-imm-lob-firms.RData", sep = "/")) # supplementary immigration lobbying firms
load(file = paste(MAIN_DIR, "skilled-pet-match-a-fy-90-17.RData", sep = "/")) # petitions data
load(file = paste(MAIN_DIR, "orbis-skilled-matched-all.RData", sep = "/")) # Orbis data


## create vars from lobbying data ----------------------------------------------
# create vars
lob.df <- lob.img.firms.years %>%
  mutate(lob_img_2017 = ifelse(year == 2017 & n_img_rep_year >= 1, 1, 0),
         lob_img_2016 = ifelse(year == 2016 & n_img_rep_year >= 1, 1, 0),
         
         lob_wh_eop_2017 = ifelse(year == 2017 & (lob_wh == 1 | lob_eop == 1), 1, 0),
         lob_uscis_2017 = ifelse(year == 2017 & lob_uscis == 1, 1, 0),
         lob_dhs_2017 = ifelse(year == 2017 & lob_dhs == 1, 1, 0),
         lob_dol_2017 = ifelse(year == 2017 & lob_dol == 1, 1, 0),
         lob_only_congress_2017 = ifelse(
             year == 2017 & 
             lob_congress == 1 & 
             lob_dos == 0 & 
             lob_dhs == 0 &
             lob_uscis == 0 &
             lob_ice == 0 &
             lob_cbp == 0 &
             lob_doc == 0 &
             lob_dol == 0 &
             lob_dod == 0 &
             lob_doj == 0 &
             lob_treas == 0 &
             lob_wh == 0 &
             lob_eop == 0 &
             lob_ustr == 0 &
             lob_omb == 0 &
             lob_ostp == 0 &
             lob_nsc == 0 &
             lob_nec == 0 &
             lob_hhs == 0 &
             lob_vp == 0 &
             lob_usda == 0 &
             lob_cms == 0 &
             lob_dot == 0 &
             lob_edu == 0 &
             lob_tda == 0 &
             lob_eta == 0 &
             lob_marad == 0 &
             lob_tsa == 0, 
           1, 0),
         
         lob_text_hs_h1b_visa_2017 = ifelse(year == 2017 & (text_hs_h1b_visa == 1), 1, 0),
         lob_text_hs_h1b_visa_wh_eop_2017 = ifelse(year == 2017 & text_hs_h1b_visa == 1 & lob_wh_eop_2017 == 1, 1, 0),
         lob_text_hs_h1b_visa_uscis_2017 = ifelse(year == 2017 & text_hs_h1b_visa == 1 & lob_uscis_2017 == 1, 1, 0),
         lob_text_hs_h1b_visa_dhs_2017 = ifelse(year == 2017 & text_hs_h1b_visa == 1 & lob_dhs_2017 == 1, 1, 0),
         lob_text_hs_h1b_visa_dol_2017 = ifelse(year == 2017 & text_hs_h1b_visa == 1 & lob_dol_2017 == 1, 1, 0),
         lob_text_hs_h1b_visa_only_congress_2017 = ifelse(year == 2017 & text_hs_h1b_visa == 1 & lob_only_congress_2017 == 1, 1, 0)) %>%
  group_by(client_bvdid) %>%
  mutate(lob_img_2017 = ifelse(any(lob_img_2017 == 1), 1, 0),
         lob_img_2016 = ifelse(any(lob_img_2016 == 1), 1, 0),
         
         lob_wh_eop_2017 = ifelse(any(lob_wh_eop_2017 == 1), 1, 0),
         lob_uscis_2017 = ifelse(any(lob_uscis_2017 == 1), 1, 0),
         lob_dhs_2017 = ifelse(any(lob_dhs_2017 == 1), 1, 0),
         lob_dol_2017 = ifelse(any(lob_dol_2017 == 1), 1, 0),
         lob_only_congress_2017 = ifelse(any(lob_only_congress_2017 == 1), 1, 0),

         lob_text_hs_h1b_visa_2017 = ifelse(any(lob_text_hs_h1b_visa_2017 == 1), 1, 0),
         lob_text_hs_h1b_visa_wh_eop_2017 = ifelse(any(lob_text_hs_h1b_visa_wh_eop_2017 == 1), 1, 0),
         lob_text_hs_h1b_visa_uscis_2017 = ifelse(any(lob_text_hs_h1b_visa_uscis_2017 == 1), 1, 0),
         lob_text_hs_h1b_visa_dhs_2017 = ifelse(any(lob_text_hs_h1b_visa_dhs_2017 == 1), 1, 0),
         lob_text_hs_h1b_visa_dol_2017 = ifelse(any(lob_text_hs_h1b_visa_dol_2017 == 1), 1, 0),
         lob_text_hs_h1b_visa_only_congress_2017 = ifelse(any(lob_text_hs_h1b_visa_only_congress_2017 == 1), 1, 0)) %>%
  ungroup() %>%
  select(client_bvdid, 
         year, 
         client_name, 
         client_naics, 
         est_img_expense_fy, 
         n_img_rep_year, 
         lob_img_2017, lob_img_2016, 
         lob_wh_eop_2017, 
         lob_uscis_2017, 
         lob_dhs_2017, 
         lob_dol_2017,
         lob_only_congress_2017,
         lob_text_hs_h1b_visa_2017, 
         lob_text_hs_h1b_visa_wh_eop_2017,
         lob_text_hs_h1b_visa_uscis_2017,
         lob_text_hs_h1b_visa_dhs_2017,
         lob_text_hs_h1b_visa_dol_2017,
         lob_text_hs_h1b_visa_only_congress_2017)

# how many firms lobbied on immigration between 1999-2017? 663
n_distinct(lob.df$client_bvdid)

# how many firms lobbied on immigration in 2017? 127
lob.df %>%
  filter(year == 2017 & lob_img_2017 == 1) %>%
  pull(client_bvdid) %>%
  n_distinct()


## merge petitions and Orbis data ----------------------------------------------
merge.1 <- left_join(skilled.pet.match.a.fy, 
                     orbis.df %>%
                       select(-bvd_name),
                     by = c("bvd_id", "year"))


## merge in lobbying data ------------------------------------------------------
merge.2 <- left_join(merge.1, 
                     lob.df,
                     by = c("bvd_id" = "client_bvdid", 
                            "year"))

merge.2 <- left_join(merge.2, 
                     lob.tob.firms.years,
                     by = c("bvd_id" = "client_bvd"))

merge.2 <- left_join(merge.2, 
                     lob.bev.firms.years,
                     by = c("bvd_id" = "client_bvd"))

merge.2 <- left_join(merge.2, 
                     lob.cdt.firms.years,
                     by = c("bvd_id" = "client_bvd"))


## final cleaning --------------------------------------------------------------
names(merge.2)

# subset vars
data.merge <- merge.2 %>%
  select(bvd_id, 
         bvd_name, 
         year, 
         
         h1b_pet_name, l1_pet_name,
         h1b_pet_state, l1_pet_state,
         h1b_pet_zcta, l1_pet_zcta,
         
         h1b_pet_firm_year_tot, 
         h1b_pet_firm_year_tot_exclude_pending,
         l1_pet_firm_year_tot, 
         l1_pet_firm_year_tot_exclude_pending,
         
         h1b_approve, h1b_approve_rate, 
         h1b_deny, h1b_deny_rate, 
         h1b_revoke, h1b_revoke_rate, 
         h1b_admin_close, h1b_admin_close_rate, 
         h1b_pending, h1b_pending_rate, 
         
         client_name, 
         client_naics, 
         n_img_rep_year, 
         est_img_expense_fy, 
         
         lob_img_2017, 
         lob_img_2016, 
         
         lob_uscis_2017, 
         lob_dhs_2017, 
         lob_dol_2017, 
         lob_wh_eop_2017, 
         lob_only_congress_2017,  
         
         lob_text_hs_h1b_visa_2017, 
         lob_text_hs_h1b_visa_uscis_2017, 
         lob_text_hs_h1b_visa_dhs_2017,
         lob_text_hs_h1b_visa_dol_2017, 
         lob_text_hs_h1b_visa_wh_eop_2017, 
         lob_text_hs_h1b_visa_only_congress_2017,
         
         lob_tob_2017, 
         lob_bev_2017, 
         lob_cdt_2017,
         
         # Orbis data
         naics_core,
         naics_core_descrip,
         size,
         public,
         sales,
         n_employ) %>%
  arrange(bvd_id, year) 

# impute zeros for years with no lobbying
data.merge <- data.merge %>%
  mutate(n_img_rep_year = ifelse(is.na(n_img_rep_year), 0, n_img_rep_year),
         est_img_expense_fy = ifelse(is.na(est_img_expense_fy), 0, est_img_expense_fy)) 

# fill in missing values for time-invariant vars
data.merge <- data.merge %>%
  group_by(bvd_id) %>%
  fill(lob_img_2017, .direction = "downup") %>%
  fill(lob_img_2016, .direction = "downup") %>%
  fill(lob_uscis_2017, .direction = "downup") %>%
  fill(lob_dhs_2017, .direction = "downup") %>%
  fill(lob_dol_2017, .direction = "downup") %>%
  fill(lob_wh_eop_2017, .direction = "downup") %>%
  fill(lob_only_congress_2017, .direction = "downup") %>%
  fill(lob_text_hs_h1b_visa_2017, .direction = "downup") %>%
  fill(lob_text_hs_h1b_visa_uscis_2017, .direction = "downup") %>%
  fill(lob_text_hs_h1b_visa_dhs_2017, .direction = "downup") %>%
  fill(lob_text_hs_h1b_visa_dol_2017, .direction = "downup") %>%
  fill(lob_text_hs_h1b_visa_wh_eop_2017, .direction = "downup") %>%
  fill(lob_text_hs_h1b_visa_only_congress_2017, .direction = "downup") %>%
  fill(client_naics, .direction = "downup") %>%
  fill(naics_core, .direction = "up") %>%
  fill(naics_core_descrip, .direction = "up") %>%
  fill(size, .direction = "up") %>%
  fill(public, .direction = "up") %>%
  ungroup() %>%
  mutate(lob_img_2017 = ifelse(is.na(lob_img_2017), 0, lob_img_2017),
         lob_img_2016 = ifelse(is.na(lob_img_2016), 0, lob_img_2016),
         
         lob_uscis_2017 = ifelse(is.na(lob_uscis_2017), 0, lob_uscis_2017),
         lob_dhs_2017 = ifelse(is.na(lob_dhs_2017), 0, lob_dhs_2017),
         lob_dol_2017 = ifelse(is.na(lob_dol_2017), 0, lob_dol_2017),
         lob_wh_eop_2017 = ifelse(is.na(lob_wh_eop_2017), 0, lob_wh_eop_2017),
         lob_only_congress_2017 = ifelse(is.na(lob_only_congress_2017), 0, lob_only_congress_2017),
         
         lob_text_hs_h1b_visa_2017 = ifelse(is.na(lob_text_hs_h1b_visa_2017), 0, lob_text_hs_h1b_visa_2017),
         lob_text_hs_h1b_visa_uscis_2017 = ifelse(is.na(lob_text_hs_h1b_visa_uscis_2017), 0, lob_text_hs_h1b_visa_uscis_2017),
         lob_text_hs_h1b_visa_dhs_2017 = ifelse(is.na(lob_text_hs_h1b_visa_dhs_2017), 0, lob_text_hs_h1b_visa_dhs_2017),
         lob_text_hs_h1b_visa_dol_2017 = ifelse(is.na(lob_text_hs_h1b_visa_dol_2017), 0, lob_text_hs_h1b_visa_dol_2017),
         lob_text_hs_h1b_visa_wh_eop_2017 = ifelse(is.na(lob_text_hs_h1b_visa_wh_eop_2017), 0, lob_text_hs_h1b_visa_wh_eop_2017),
         lob_text_hs_h1b_visa_only_congress_2017 = ifelse(is.na(lob_text_hs_h1b_visa_only_congress_2017), 0, lob_text_hs_h1b_visa_only_congress_2017),
  
         lob_tob_2017 = ifelse(is.na(lob_tob_2017), 0, lob_tob_2017),
         lob_bev_2017 = ifelse(is.na(lob_bev_2017), 0, lob_bev_2017),
         lob_cdt_2017 = ifelse(is.na(lob_cdt_2017), 0, lob_cdt_2017),
         
         lob_only_tob_2017 = ifelse(lob_tob_2017 == 1 & lob_img_2017 == 0, 1, 0),
         lob_only_bev_2017 = ifelse(lob_bev_2017 == 1 & lob_img_2017 == 0, 1, 0),
         lob_only_cdt_2017 = ifelse(lob_cdt_2017 == 1 & lob_img_2017 == 0, 1, 0),
         
         public = ifelse(is.na(public), 0, public),
         
         post_2017 = if_else(year >= 2017, 1, 0),
         post_2004 = if_else(year >= 2004, 1, 0),
         
         year_1991 = if_else(year == 1991, 1, 0),
         year_1992 = if_else(year == 1992, 1, 0),
         year_1993 = if_else(year == 1993, 1, 0),
         year_1994 = if_else(year == 1994, 1, 0),
         year_1995 = if_else(year == 1995, 1, 0),
         year_1996 = if_else(year == 1996, 1, 0),
         year_1997 = if_else(year == 1997, 1, 0),
         year_1998 = if_else(year == 1998, 1, 0),
         year_1999 = if_else(year == 1999, 1, 0),
         year_2000 = if_else(year == 2000, 1, 0),
         year_2001 = if_else(year == 2001, 1, 0),
         year_2002 = if_else(year == 2002, 1, 0),
         year_2003 = if_else(year == 2003, 1, 0),
         year_2004 = if_else(year == 2004, 1, 0),
         year_2005 = if_else(year == 2005, 1, 0),
         year_2006 = if_else(year == 2006, 1, 0),
         year_2007 = if_else(year == 2007, 1, 0),
         year_2008 = if_else(year == 2008, 1, 0),
         year_2009 = if_else(year == 2009, 1, 0),
         year_2010 = if_else(year == 2010, 1, 0),
         year_2011 = if_else(year == 2011, 1, 0),
         year_2012 = if_else(year == 2012, 1, 0),
         year_2013 = if_else(year == 2013, 1, 0),
         year_2014 = if_else(year == 2014, 1, 0),
         year_2015 = if_else(year == 2015, 1, 0),
         year_2016 = if_else(year == 2016, 1, 0),
         year_2017 = if_else(year == 2017, 1, 0))

# Additional petitioning firms filed reports in 2017 containing immigration-related keywords but not under IMM
# merge in data 
data.merge <- left_join(data.merge, 
                        sup.imm.lob.firms %>%
                          select(-client_name),
                        by = "bvd_id")

# create augmented immigration lobbying measures
data.merge <- data.merge %>%
  mutate(lob_img_2017_aug = ifelse(is.na(lob_img_2017_aug), lob_img_2017, lob_img_2017_aug),
         lob_text_hs_h1b_visa_2017_aug = ifelse(is.na(lob_text_hs_h1b_visa_2017_aug), lob_text_hs_h1b_visa_2017, lob_text_hs_h1b_visa_2017_aug),
         lob_text_hs_h1b_visa_uscis_2017_aug = ifelse(is.na(lob_text_hs_h1b_visa_uscis_2017_aug), lob_text_hs_h1b_visa_uscis_2017, lob_text_hs_h1b_visa_uscis_2017_aug),
         lob_text_hs_h1b_visa_dol_2017_aug = ifelse(is.na(lob_text_hs_h1b_visa_dol_2017_aug), lob_text_hs_h1b_visa_dol_2017, lob_text_hs_h1b_visa_dol_2017_aug),
         lob_text_hs_h1b_visa_dhs_2017_aug = ifelse(is.na(lob_text_hs_h1b_visa_dhs_2017_aug), lob_text_hs_h1b_visa_dhs_2017, lob_text_hs_h1b_visa_dhs_2017_aug),
         lob_text_hs_h1b_visa_wh_eop_2017_aug = ifelse(is.na(lob_text_hs_h1b_visa_wh_eop_2017_aug), lob_text_hs_h1b_visa_wh_eop_2017, lob_text_hs_h1b_visa_wh_eop_2017_aug),
         lob_text_hs_h1b_visa_only_congress_2017_aug = ifelse(is.na(lob_text_hs_h1b_visa_only_congress_2017_aug), lob_text_hs_h1b_visa_only_congress_2017, lob_text_hs_h1b_visa_only_congress_2017_aug))

# check final data
glimpse(data.merge)
summary(data.merge)

## save ------------------------------------------------------------------------
save(data.merge, 
     file = paste(MAIN_DIR, "data-merge-91-17.RData", sep = "/"))  

